Github account: https://github.com/EconomistInRealLife

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE,
                     fig.width = 10, fig.height = 6)

required_packages <- c("wooldridge", "dplyr", "tidyr", "tibble", "readr", "magrittr", "ggplot2", "ggpubr", "patchwork", "scales", "corrplot","stats", "broom", "performance", "car", "sandwich", "lmtest", "psych","knitr", "stargazer", "kableExtra"
)

# Install missing packages
new_packages <- required_packages[!required_packages %in% installed.packages()[,"Package"]]
if(length(new_packages)) install.packages(new_packages)

# Load libraries
library(wooldridge)    # For dataset
library(dplyr)         # Data manipulation
library(tidyr)         # Data tidying
library(tibble)        # Modern data frames
library(readr)         # Data import/export
library(magrittr)      # Pipe operators

library(ggplot2)       # Primary visualization
library(ggpubr)        # Publication-ready plots
library(patchwork)     # Plot arrangement
library(scales)        # Axis formatting
library(corrplot)      # Correlation matrices

library(stats)         # Statistical functions
library(broom)         # Tidy model outputs
library(performance)   # Model diagnostics
library(car)           # Regression diagnostics
library(sandwich)      # Robust standard errors
library(lmtest)        # Hypothesis testing
library(psych)         # Descriptive statistics

library(knitr)         # Report generation
library(stargazer)     # Regression tables
library(kableExtra)    # Enhanced table formatting
# Introduction 
cat("***Date Source***\n")
## ***Date Source***
cat("The dataset selected is meap01, which comes from the Michigan Department of Education and it is featured in Wooldridge's (2011) textbook \"Introductory Econometrics: A Modern Approach.\" \n")
## The dataset selected is meap01, which comes from the Michigan Department of Education and it is featured in Wooldridge's (2011) textbook "Introductory Econometrics: A Modern Approach."
cat("It is an extensive school-level dataset from 2001, that emphasises the Michigan Educational Assessment Program (MEAP) for fourth graders. \n")
## It is an extensive school-level dataset from 2001, that emphasises the Michigan Educational Assessment Program (MEAP) for fourth graders.
cat("The MEAP was created to assess student achievement in core academic disciplines across Michigan's public schools. \n")
## The MEAP was created to assess student achievement in core academic disciplines across Michigan's public schools.
cat("This dataset is useful for educational policy evaluation and economic research since it includes both performance results and a variety of school-level variables, such as expenditures and socioeconomic indicators. \n")
## This dataset is useful for educational policy evaluation and economic research since it includes both performance results and a variety of school-level variables, such as expenditures and socioeconomic indicators.
cat("While the Wooldridge R package is used to retrieve the data, it is important to note that these are actual administrative records rather than synthetic or simulated observations.\n\n")
## While the Wooldridge R package is used to retrieve the data, it is important to note that these are actual administrative records rather than synthetic or simulated observations.
cat("Project Description:\n\n",
    "This project looks at the association between fourth-grade academic achievement and school-level socioeconomic characteristics in Michigan public schools.\n",
    "Using data from the meap01 dataset, the study investigates how poverty (defined by the percentage of pupils eligible for free lunch) and per-pupil spending affect standardized math test scores.\n",
    "The study's goal is to determine whether increasing educational spending improves results, particularly in high-poverty areas, and whether those benefits fade over time.\n",
    "Statistical methods such as linear and logistic regressions, interaction terms, and visualization techniques are used to identify trends, disparities, and policy-relevant insights.\n",
    "The ultimate goal is to have a better understanding of how resource allocation and poverty interact to influence student accomplishment and advise fair educational funding solutions.\n")
## Project Description:
## 
##  This project looks at the association between fourth-grade academic achievement and school-level socioeconomic characteristics in Michigan public schools.
##  Using data from the meap01 dataset, the study investigates how poverty (defined by the percentage of pupils eligible for free lunch) and per-pupil spending affect standardized math test scores.
##  The study's goal is to determine whether increasing educational spending improves results, particularly in high-poverty areas, and whether those benefits fade over time.
##  Statistical methods such as linear and logistic regressions, interaction terms, and visualization techniques are used to identify trends, disparities, and policy-relevant insights.
##  The ultimate goal is to have a better understanding of how resource allocation and poverty interact to influence student accomplishment and advise fair educational funding solutions.
cat("**Title:** The Effects of School Spending on Educational & Economic Outcomes (Quarterly Journal of Economics)\n\n")
## **Title:** The Effects of School Spending on Educational & Economic Outcomes (Quarterly Journal of Economics)
cat("**Citation (APA):** Jackson, C. K., Johnson, R. C., & Persico, C. (2016). The effects of school spending on educational and economic outcomes. *Quarterly Journal of Economics, 131*(1), 157-218. https://doi.org/10.1093/qje/qjv036\n\n")
## **Citation (APA):** Jackson, C. K., Johnson, R. C., & Persico, C. (2016). The effects of school spending on educational and economic outcomes. *Quarterly Journal of Economics, 131*(1), 157-218. https://doi.org/10.1093/qje/qjv036
cat("**Summary:** This study leverages exogenous variation in school spending from court-mandated school finance reforms to examine causal impacts. Results demonstrate that increased spending leads to significant improvements in test scores, educational attainment, and adult wages, with particularly strong effects in lower-income districts.")
## **Summary:** This study leverages exogenous variation in school spending from court-mandated school finance reforms to examine causal impacts. Results demonstrate that increased spending leads to significant improvements in test scores, educational attainment, and adult wages, with particularly strong effects in lower-income districts.
cat("\n--- Midterm Challenges: Were the Goals Achieved? ---\n")
## 
## --- Midterm Challenges: Were the Goals Achieved? ---
cat("Data Cleaning & Transformation:\n")
## Data Cleaning & Transformation:
cat("  - Cleaned relevant variables, changed row, column names as specified in the midterm, created log-transformed spending (lexppp),\n")
##   - Cleaned relevant variables, changed row, column names as specified in the midterm, created log-transformed spending (lexppp),
cat("    handled missing values, and merged in district-level data.\n\n")
##     handled missing values, and merged in district-level data.
cat("Visualization Goals:\n")
## Visualization Goals:
cat("  - Used boxplots, scatterplots, correlation matrices, and faceted plots\n")
##   - Used boxplots, scatterplots, correlation matrices, and faceted plots
cat("    to visualize relationships between poverty, spending, and test performance.\n\n")
##     to visualize relationships between poverty, spending, and test performance.
cat("Statistical Modeling Goals:\n")
## Statistical Modeling Goals:
cat("  - Built linear and logistic regression models, including interaction terms.\n")
##   - Built linear and logistic regression models, including interaction terms.
cat("  - Modeled binary outcome (80%+ math proficiency) using logistic regression.\n\n")
##   - Modeled binary outcome (80%+ math proficiency) using logistic regression.
cat("Nonlinear Effects:\n")
## Nonlinear Effects:
cat("  - Quadratic regression of log spending confirmed diminishing returns\n")
##   - Quadratic regression of log spending confirmed diminishing returns
cat("    to educational investment.\n\n")
##     to educational investment.
cat("District-Level Effects:\n")
## District-Level Effects:
cat("  - Analyzed average math scores and poverty effects at the district level\n")
##   - Analyzed average math scores and poverty effects at the district level
cat("    to complement school-level findings.\n\n")
##     to complement school-level findings.
cat("--- Next Steps in the Project ---\n")
## --- Next Steps in the Project ---
cat("• Model Refinement:\n")
## • Model Refinement:
cat("  - Consider hierarchical models or random effects for district-level variation.\n")
##   - Consider hierarchical models or random effects for district-level variation.
cat("• Causal Inference:\n")
## • Causal Inference:
cat("  - Explore IV or difference-in-differences methods for stronger causal claims.\n")
##   - Explore IV or difference-in-differences methods for stronger causal claims.
cat("• Policy Simulation:\n")
## • Policy Simulation:
cat("  - Simulate how targeted spending might affect low- vs. high-poverty schools.\n")
##   - Simulate how targeted spending might affect low- vs. high-poverty schools.
cat("• Reading Scores:\n")
## • Reading Scores:
cat("  - Apply the same framework to analyze reading performance for comparison.\n")
##   - Apply the same framework to analyze reading performance for comparison.
ls("package:wooldridge")
##   [1] "admnrev"       "affairs"       "airfare"       "alcohol"      
##   [5] "apple"         "approval"      "athlet1"       "athlet2"      
##   [9] "attend"        "audit"         "barium"        "beauty"       
##  [13] "benefits"      "beveridge"     "big9salary"    "bwght"        
##  [17] "bwght2"        "campus"        "card"          "catholic"     
##  [21] "cement"        "census2000"    "ceosal1"       "ceosal2"      
##  [25] "charity"       "consump"       "corn"          "countymurders"
##  [29] "cps78_85"      "cps91"         "crime1"        "crime2"       
##  [33] "crime3"        "crime4"        "discrim"       "driving"      
##  [37] "earns"         "econmath"      "elem94_95"     "engin"        
##  [41] "expendshares"  "ezanders"      "ezunem"        "fair"         
##  [45] "fertil1"       "fertil2"       "fertil3"       "fish"         
##  [49] "fringe"        "gpa1"          "gpa2"          "gpa3"         
##  [53] "happiness"     "hprice1"       "hprice2"       "hprice3"      
##  [57] "hseinv"        "htv"           "infmrt"        "injury"       
##  [61] "intdef"        "intqrt"        "inven"         "jtrain"       
##  [65] "jtrain2"       "jtrain3"       "jtrain98"      "k401k"        
##  [69] "k401ksubs"     "kielmc"        "labsup"        "lawsch85"     
##  [73] "loanapp"       "lowbrth"       "mathpnl"       "meap00_01"    
##  [77] "meap01"        "meap93"        "meapsingle"    "minwage"      
##  [81] "mlb1"          "mroz"          "murder"        "nbasal"       
##  [85] "ncaa_rpi"      "nyse"          "okun"          "openness"     
##  [89] "pension"       "phillips"      "pntsprd"       "prison"       
##  [93] "prminwge"      "rdchem"        "rdtelec"       "recid"        
##  [97] "rental"        "return"        "saving"        "school93_98"  
## [101] "sleep75"       "slp75_81"      "smoke"         "traffic1"     
## [105] "traffic2"      "twoyear"       "volat"         "vote1"        
## [109] "vote2"         "voucher"       "wage1"         "wage2"        
## [113] "wagepan"       "wageprc"       "wine"
data( "meap01")
df <- meap01

#PART 1: Explore the dataset
str(meap01)
## 'data.frame':    1823 obs. of  11 variables:
##  $ dcode  : num  1010 2070 2080 3010 3010 3010 3020 3020 3020 3030 ...
##  $ bcode  : int  4937 597 4860 790 1403 4056 922 2864 4851 881 ...
##  $ math4  : num  83.3 90.3 61.9 85.7 77.3 ...
##  $ read4  : num  77.8 82.3 71.4 60 59.1 ...
##  $ lunch  : num  40.6 27.1 41.8 12.8 17.1 ...
##  $ enroll : int  468 679 400 251 439 561 442 381 274 326 ...
##  $ expend : num  2747475 1505772 2121871 1211034 1913501 ...
##  $ exppp  : num  5871 2218 5305 4825 4359 ...
##  $ lenroll: num  6.15 6.52 5.99 5.53 6.08 ...
##  $ lexpend: num  14.8 14.2 14.6 14 14.5 ...
##  $ lexppp : num  8.68 7.7 8.58 8.48 8.38 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
summary(meap01)
##      dcode           bcode          math4            read4       
##  Min.   : 1010   Min.   :   1   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:33902   1st Qu.:1386   1st Qu.: 61.60   1st Qu.: 48.90  
##  Median :55100   Median :2794   Median : 76.40   Median : 62.70  
##  Mean   :53028   Mean   :3395   Mean   : 71.91   Mean   : 60.06  
##  3rd Qu.:77010   3rd Qu.:5084   3rd Qu.: 87.00   3rd Qu.: 73.90  
##  Max.   :83070   Max.   :8838   Max.   :100.00   Max.   :100.00  
##      lunch            enroll           expend            exppp      
##  Min.   :  0.00   Min.   :  62.0   Min.   : 275985   Min.   : 1207  
##  1st Qu.: 16.61   1st Qu.: 287.0   1st Qu.:1474465   1st Qu.: 4502  
##  Median : 35.10   Median : 379.0   Median :1894043   Median : 5078  
##  Mean   : 39.25   Mean   : 401.9   Mean   :2036984   Mean   : 5195  
##  3rd Qu.: 59.06   3rd Qu.: 484.0   3rd Qu.:2407636   3rd Qu.: 5767  
##  Max.   :100.00   Max.   :1496.0   Max.   :7665998   Max.   :11958  
##     lenroll         lexpend          lexppp     
##  Min.   :4.127   Min.   :12.53   Min.   :7.096  
##  1st Qu.:5.659   1st Qu.:14.20   1st Qu.:8.412  
##  Median :5.938   Median :14.45   Median :8.533  
##  Mean   :5.911   Mean   :14.44   Mean   :8.533  
##  3rd Qu.:6.182   3rd Qu.:14.69   3rd Qu.:8.660  
##  Max.   :7.311   Max.   :15.85   Max.   :9.389
# Total missing values per column
colSums(is.na(meap01))
##   dcode   bcode   math4   read4   lunch  enroll  expend   exppp lenroll lexpend 
##       0       0       0       0       0       0       0       0       0       0 
##  lexppp 
##       0
# Total missing values in the entire dataset
sum(is.na(meap01))
## [1] 0
# PART 2: How many rows and columns?
cat("Number of observations:", nrow(meap01), "\n")  
## Number of observations: 1823
cat("Number of variables:", ncol(meap01), "\n")     
## Number of variables: 11
# Data cleaning and transformation
clean_df <- df %>%
  rename(math_score = math4, reading_score = read4) %>%
  filter(math_score > 0, reading_score > 0, enroll >= 100) %>%
  mutate(
    high_poverty = ifelse(lunch > 50, 1, 0),
    expenditure_bin = ntile(exppp, 4),
    math_binary = ifelse(math_score >= 80, 1, 0),
    read_binary = ifelse(reading_score >= 80, 1, 0)
  ) %>%
  select(-c(dcode, bcode, lexpend, lenroll))

# Check structure
glimpse(clean_df)
## Rows: 1,815
## Columns: 11
## $ math_score      <dbl> 83.3, 90.3, 61.9, 85.7, 77.3, 85.2, 91.8, 87.7, 87.8, …
## $ reading_score   <dbl> 77.8, 82.3, 71.4, 60.0, 59.1, 67.0, 67.7, 80.0, 61.2, …
## $ lunch           <dbl> 40.60, 27.10, 41.75, 12.75, 17.08, 23.17, 25.57, 27.30…
## $ enroll          <int> 468, 679, 400, 251, 439, 561, 442, 381, 274, 326, 273,…
## $ expend          <dbl> 2747475, 1505772, 2121871, 1211034, 1913501, 2637483, …
## $ exppp           <dbl> 5870.673, 2217.632, 5304.678, 4824.836, 4358.772, 4701…
## $ lexppp          <dbl> 8.677725, 7.704195, 8.576344, 8.481532, 8.379946, 8.45…
## $ high_poverty    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ expenditure_bin <int> 4, 1, 3, 2, 1, 2, 1, 2, 1, 3, 3, 1, 2, 2, 2, 2, 1, 1, …
## $ math_binary     <dbl> 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, …
## $ read_binary     <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Summary statistics
summary_stats <- clean_df %>%
  select(math_score, reading_score, lunch, enroll, exppp, lexppp) %>%
  psych::describe(quant = c(.25, .75)) %>%
  select(-vars, -trimmed, -mad, -range)

print(summary_stats)
##                  n    mean      sd  median     min      max  skew kurtosis
## math_score    1815   72.00   19.84   76.40    3.60   100.00 -0.96     0.45
## reading_score 1815   60.13   19.07   62.70    3.40   100.00 -0.63    -0.04
## lunch         1815   39.16   26.42   34.99    0.00   100.00  0.47    -0.85
## enroll        1815  403.12  169.13  380.00  103.00  1496.00  1.36     3.89
## exppp         1815 5193.24 1087.35 5078.25 1206.88 11957.64  0.75     3.04
## lexppp        1815    8.53    0.21    8.53    7.10     9.39 -0.69     4.16
##                  se   Q0.25   Q0.75
## math_score     0.47   61.70   87.00
## reading_score  0.45   48.90   73.90
## lunch          0.62   16.59   58.97
## enroll         3.97  287.50  485.00
## exppp         25.52 4503.86 5766.08
## lexppp         0.01    8.41    8.66
# Outlier detection
outlier_plot <- clean_df %>%
  select(math_score, reading_score, lunch, exppp) %>%
  pivot_longer(everything()) %>%
  ggplot(aes(y = value)) +
  geom_boxplot(fill = "skyblue") +
  facet_wrap(~name, scales = "free") +
  labs(title = "Distribution of Key Variables", 
       subtitle = "Outlier Detection",
       y = "Value") +
  theme_minimal()

# Distribution plots
dist_plot <- clean_df %>%
  select(math_score, reading_score, lunch, lexppp) %>%
  pivot_longer(everything()) %>%
  ggplot(aes(x = value)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "steelblue", alpha = 0.7) +
  geom_density(color = "darkred", linewidth = 1) +
  facet_wrap(~name, scales = "free") +
  labs(title = "Distribution of Key Variables", x = "") +
  theme_minimal()

# Combine plots
(outlier_plot | dist_plot) + plot_annotation(title = "Initial Data Exploration")

# Summary of Initial Data Analysis:
# The initial data analysis finds significant patterns in student success 
# and school resources across Michigan's public schools. Boxplots indicate that, 
# while variables such as poverty levels (lunch) and test scores (math_score, 
# reading_score) have a moderate distribution, there are noteworthy outliers, 
# particularly in schools with exceptionally low performance or unusually high 
# per-pupil spending. These findings support the decision to use log transformations 
# and robust regressions in later analyses, and they emphasize the necessity of 
# testing for interaction effects between poverty and spending. Overall, the data 
# distributions support the modeling technique and indicate significant variation 
# in how resources and poverty affect academic achievement.
# Fixed plots with proper syntax
poverty_plot <- ggplot(clean_df, aes(x = lunch, y = math_score)) +
  geom_point(alpha = 0.5, color = "steelblue") +
  geom_smooth(method = "loess", color = "firebrick", se = FALSE) +
  geom_smooth(method = "lm", color = "darkgreen", linetype = "dashed", se = FALSE) +
  labs(title = "Math Performance vs. Poverty Levels",
       x = "Percentage Eligible for Free Lunch",
       y = "Math Proficiency (%)") +
  theme_minimal()

spending_plot <- ggplot(clean_df, aes(x = exppp, y = math_score)) +
  geom_point(alpha = 0.5, aes(color = factor(high_poverty))) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_x_continuous(labels = scales::dollar) +  # Fixed dollar formatting
  labs(title = "Math Performance vs. Per-Pupil Spending",
       x = "Per-Pupil Expenditure (USD)",
       y = "Math Proficiency (%)",
       color = "High Poverty") +
  scale_color_discrete(labels = c("No", "Yes")) +  # Better legend labels
  theme_minimal() +
  theme(legend.position = "bottom")

interaction_plot <- ggplot(clean_df, aes(x = lexppp, y = math_score)) +
  geom_point(alpha = 0.3) +
  geom_smooth(aes(group = factor(high_poverty)),  # Fixed parentheses
              method = "lm", se = FALSE) +
  facet_wrap(~high_poverty, 
             labeller = as_labeller(c("0" = "Low Poverty", "1" = "High Poverty"))) +
  labs(title = "Math Performance vs. Log Spending by Poverty Status",
       x = "Log(Per-Pupil Expenditure)",
       y = "Math Proficiency (%)") +
  theme_minimal()

# Arrange plots with patchwork
poverty_plot / (spending_plot | interaction_plot)

# Visualization Insights:
# The visualizations provide vital insights about the relationship between school 
# poverty, spending, and student math performance. The top plot demonstrates a 
# clear negative link between the percentage of pupils eligible for free lunch 
# (a proxy for poverty) and math competency, showing that higher poverty rates 
# are substantially associated with poorer academic outcomes.
# 
# The bottom-left plot shows a weak positive relationship between per-pupil spending 
# and math achievement. However, clustering of data points and color coding by poverty 
# level reveal that high-poverty schools consistently cluster at lower proficiency 
# levels, regardless of spending.
#
# The bottom-right faceted plot (using log-transformed spending) shows that in 
# low-poverty schools, additional spending is more clearly associated with higher 
# math performance. In contrast, the relationship flattens in high-poverty schools, 
# suggesting diminishing marginal returns in disadvantaged settings.
#
# These findings emphasize the need for poverty-sensitive policies beyond just 
# increasing funding.
# Correlation matrix
cor_matrix <- clean_df %>%
  select(math_score, reading_score, lunch, enroll, exppp, lexppp) %>%
  cor(use = "complete.obs")

corrplot(cor_matrix, method = "color", type = "upper",
         tl.col = "darkblue", addCoef.col = "black",
         number.cex = 0.7, 
         title = "Correlation Matrix of Key Variables")

# Correlation Matrix Insights:
# The matrix highlights key relationships between core variables in the dataset.
# Math and reading proficiency are highly correlated (r = 0.84), indicating that 
# strong performance in one subject often predicts success in the other.
# 
# Both scores show a strong negative correlation with the percentage of students 
# eligible for free lunch (math: r = -0.60; reading: r = -0.61), reaffirming 
# that higher poverty levels are linked to lower academic achievement.
# 
# Per-pupil spending (exppp) and its log form (lexppp) are nearly perfectly 
# correlated (r = 0.97), suggesting multicollinearity issues if used together 
# in the same model.
# 
# School enrollment is moderately negatively correlated with per-pupil spending, 
# indicating smaller schools may spend more per student.
# 
# These findings validate theoretical expectations and help guide regression 
# model specifications—especially regarding variable selection and 
# socioeconomic influences on achievement.

Here I answered my research questions without looking at district level effects.

# ---- Run all models ----
# Model 1: Poverty + Log Spending
model1 <- lm(math_score ~ lunch + lexppp, data = clean_df)

# Model 2: Poverty + Log Spending + Enrollment
model2 <- lm(math_score ~ lunch + lexppp + enroll, data = clean_df)

# Model 3: Poverty + Log Spending + Spending Bins
model3 <- lm(math_score ~ lunch + lexppp + expenditure_bin, data = clean_df)

# Model 4: Diminishing Returns (Quadratic)
model4 <- lm(math_score ~ lexppp + I(lexppp^2), data = clean_df)

# Model 5: Linear Spending
model5 <- lm(math_score ~ exppp, data = clean_df)

# Model 6: Log Spending
model6 <- lm(math_score ~ lexppp, data = clean_df)

# Model 7: Logistic Regression for High Performance
logit_model <- glm(math_binary ~ lexppp + lunch + enroll,
                   data = clean_df, family = binomial)

# ---- Display results for all linear models ----
stargazer(model1, model2, model3, model4, model5, model6,
          type = "text",
          title = "Regression Models for Math Proficiency",
          covariate.labels = c(
            "Poverty (% free lunch)",
            "Log Per-Pupil Spending",
            "Enrollment",
            "Spending Bin: Q2",
            "Spending Bin: Q3",
            "Spending Bin: Q4",
            "Spending Squared",
            "Per-Pupil Spending"
          ),
          dep.var.labels = "Math Proficiency (%)",
          column.labels = c("Model 1", "Model 2", "Model 3", "Model 4", "Model 5", "Model 6"),
          notes = c("Standard errors in parentheses", 
                    "* p<0.05, ** p<0.01, *** p<0.001"),
          align = TRUE,
          omit.stat = c("f", "ser"),
          no.space = TRUE)
## 
## Regression Models for Math Proficiency
## ====================================================================================
##                                             Dependent variable:                     
##                        -------------------------------------------------------------
##                                            Math Proficiency (%)                     
##                         Model 1   Model 2   Model 3    Model 4    Model 5   Model 6 
##                           (1)       (2)       (3)        (4)        (5)       (6)   
## ------------------------------------------------------------------------------------
## Poverty (% free lunch) -0.468*** -0.462*** -0.470***                                
##                         (0.014)   (0.014)   (0.014)                                 
## Log Per-Pupil Spending 11.354*** 8.161***   6.759*   205.155***             -1.482  
##                         (1.772)   (1.856)   (3.481)   (71.832)              (2.174) 
## Enrollment                       -0.012***                                          
##                                   (0.002)                                           
## Spending Bin: Q2                             1.029                                  
##                                             (0.671)                                 
## Spending Bin: Q3                                     -12.214***                     
##                                                        (4.244)                      
## Spending Bin: Q4                                                  -0.001            
##                                                                  (0.0004)           
## Spending Squared        -6.556    25.428    30.178   -788.707*** 74.860*** 84.644***
##                        (15.007)  (16.043)  (28.264)   (304.023)   (2.273)  (18.557) 
## ------------------------------------------------------------------------------------
## Observations             1,815     1,815     1,815      1,815      1,815     1,815  
## R2                       0.369     0.379     0.370      0.005      0.001    0.0003  
## Adjusted R2              0.369     0.378     0.369      0.004     0.0004    -0.0003 
## ====================================================================================
## Note:                                                    *p<0.1; **p<0.05; ***p<0.01
##                                                       Standard errors in parentheses
##                                                     * p<0.05, ** p<0.01, *** p<0.001
# ---- Display logistic regression results ----
cat("\n\n--- Logistic Regression for High Math Performance (≥80%) ---\n")
## 
## 
## --- Logistic Regression for High Math Performance (≥80%) ---
logit_tidy <- tidy(logit_model, conf.int = TRUE, exponentiate = TRUE)
logit_tidy %>%
  mutate(across(where(is.numeric), ~ round(., 3))) %>%
  select(term, estimate, std.error, p.value, conf.low, conf.high) %>%
  kable(caption = "Odds Ratios for Achieving High Math Performance") %>%
  print()
## 
## 
## Table: Odds Ratios for Achieving High Math Performance
## 
## |term        | estimate| std.error| p.value| conf.low| conf.high|
## |:-----------|--------:|---------:|-------:|--------:|---------:|
## |(Intercept) |    0.000|     2.451|   0.001|    0.000|     0.048|
## |lexppp      |    2.987|     0.284|   0.000|    1.721|     5.238|
## |lunch       |    0.959|     0.002|   0.000|    0.954|     0.963|
## |enroll      |    0.999|     0.000|   0.051|    0.999|     1.000|
# ---- Model comparisons ----
cat("\n\n--- Comparison of Spending Specifications ---\n")
## 
## 
## --- Comparison of Spending Specifications ---
model_comp <- data.frame(
  Model = c("Model 5: Linear Spending", "Model 6: Log Spending"),
  R2 = c(summary(model5)$r.squared, summary(model6)$r.squared),
  Adj_R2 = c(summary(model5)$adj.r.squared, summary(model6)$adj.r.squared),
  AIC = c(AIC(model5), AIC(model6)),
  BIC = c(BIC(model5), BIC(model6))
) %>%
  mutate(across(where(is.numeric), ~ round(., 3)))

kable(model_comp, caption = "Model Fit Comparison")
Model Fit Comparison
Model R2 Adj_R2 AIC BIC
Model 5: Linear Spending 0.001 0 15999.63 16016.14
Model 6: Log Spending 0.000 0 16000.82 16017.33
# ---- Model diagnostics ----
# Model 2 diagnostics
cat("\n\n--- Diagnostics for Model 2 ---\n")
## 
## 
## --- Diagnostics for Model 2 ---
# Multicollinearity
vif_results <- vif(model2)
cat("\nVariance Inflation Factors (VIF):\n")
## 
## Variance Inflation Factors (VIF):
print(vif_results)
##    lunch   lexppp   enroll 
## 1.057502 1.172685 1.114651
# Robust standard errors
cat("\nRobust Standard Errors (HC1):\n")
## 
## Robust Standard Errors (HC1):
robust_results <- coeftest(model2, vcov = vcovHC(model2, type = "HC1"))
print(robust_results)
## 
## t test of coefficients:
## 
##               Estimate Std. Error  t value  Pr(>|t|)    
## (Intercept) 25.4284930 19.0687411   1.3335 0.1825299    
## lunch       -0.4624098  0.0164108 -28.1771 < 2.2e-16 ***
## lexppp       8.1613131  2.1925407   3.7223 0.0002034 ***
## enroll      -0.0122980  0.0027854  -4.4152 1.069e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("
### Key Findings from Analysis

1. **Poverty Effect**: Each 10 percentage point increase in free lunch eligibility is associated with a 4.7-point decrease in average math proficiency (β ≈ -0.47, p < 0.001), suggesting a strong negative relationship between poverty and achievement.

2. **Spending Impact**: Models using log-transformed per-pupil spending (e.g., Model 2, R² = 0.379) explain considerably more variance in student performance than models using raw or binned spending levels (e.g., Model 5, R² = 0.001).

3. **Nonlinear Effects**: Squared spending terms in Models 5 and 6 are positive but contribute little to explanatory power, indicating no strong evidence of diminishing or accelerating returns in the quadratic specifications.

4. **Multicollinearity & Robustness**: Variance inflation factors (VIFs) for key predictors (lunch, lexppp, and enrollment) are all below 2, indicating low multicollinearity. Robust standard errors confirm the statistical significance of key predictors in Model 2.

5. **Logistic Regression (If applicable)**: *Logistic models predicting high math performance (≥80%) show that higher poverty levels are significantly associated with lower odds of high proficiency.*
")
## 
## ### Key Findings from Analysis
## 
## 1. **Poverty Effect**: Each 10 percentage point increase in free lunch eligibility is associated with a 4.7-point decrease in average math proficiency (β ≈ -0.47, p < 0.001), suggesting a strong negative relationship between poverty and achievement.
## 
## 2. **Spending Impact**: Models using log-transformed per-pupil spending (e.g., Model 2, R² = 0.379) explain considerably more variance in student performance than models using raw or binned spending levels (e.g., Model 5, R² = 0.001).
## 
## 3. **Nonlinear Effects**: Squared spending terms in Models 5 and 6 are positive but contribute little to explanatory power, indicating no strong evidence of diminishing or accelerating returns in the quadratic specifications.
## 
## 4. **Multicollinearity & Robustness**: Variance inflation factors (VIFs) for key predictors (lunch, lexppp, and enrollment) are all below 2, indicating low multicollinearity. Robust standard errors confirm the statistical significance of key predictors in Model 2.
## 
## 5. **Logistic Regression (If applicable)**: *Logistic models predicting high math performance (≥80%) show that higher poverty levels are significantly associated with lower odds of high proficiency.*
# Research Questions and Analysis Plan
cat("
# Research Design

## Research Questions

This study examines the relationship between school resources, socioeconomic factors, and academic performance using Michigan's 2001 MEAP data. The following questions 
guide the investigation:

1. **Poverty and Spending Effects**: Does 4th-grade math performance vary by poverty level,and does per-pupil spending significantly impact test scores?
   
2. **Poverty's Persistent Impact**: After controlling for spending levels, why do schools 
   with higher poverty rates consistently show lower performance?
   
3. **Subject-Specific Poverty Gaps**: Do reading and math performance differ between high-poverty and low-poverty schools?
   
4. **Demographic Effects**: How do student demographics affect academic performance in schools with similar budget levels?
   
5. **Diminishing Returns**: Does increased per-student expenditure show diminishing marginal returns on academic achievement?
   
6. **Functional Form**: Is academic performance more strongly associated with 
   log-transformed spending measures than raw dollar amounts?

## Planned Statistical Analyses

To address these questions, the following analytical approaches will be employed:

### Regression Modeling
- **Multiple Linear Regression**: Primary analysis for continuous outcomes (math4/read4)
  - Core model: `math_score ~ poverty + spending + enrollment`
  - Extended models with interaction terms and polynomial specifications
- **Logistic Regression**: For binary outcomes (e.g., 80% proficiency threshold)
  - Model: `I(math_score >= 80) ~ spending + poverty + enrollment`

### Key Specifications
- **Spending Representation**: 
  - Linear: Per-pupil expenditure (exppp)
  - Log-transformed: Lexppp (log per-pupil spending)
- **Poverty Interactions**: 
  - High-poverty indicator (lunch > 50%)
  - Spending × poverty interaction terms
- **Non-Linear Terms**: 
  - Quadratic spending term: `I(lexppp^2)`
  - Spending quartile indicators

### Analytical Techniques
1. **Visual Analysis**: Scatterplots with smoothing curves, boxplots by poverty status
2. **Correlation Analysis**: Heatmaps of variable relationships
3. **Model Comparison**: AIC/BIC for linear vs log spending specifications
4. **Diagnostic Testing**:
   - Variance Inflation Factors (VIF) for multicollinearity
   - Breusch-Pagan test for heteroscedasticity
   - Robust standard errors (HC1)
   - Residual analysis for model fit

### Causal Inference Considerations
- **Limitations**: Acknowledgment of observational data constraints
- **Robustness Checks**: 
  - Alternative model specifications
  - Subgroup analyses by poverty level
  - District fixed effects (using dcodes)
")
## 
## # Research Design
## 
## ## Research Questions
## 
## This study examines the relationship between school resources, socioeconomic factors, and academic performance using Michigan's 2001 MEAP data. The following questions 
## guide the investigation:
## 
## 1. **Poverty and Spending Effects**: Does 4th-grade math performance vary by poverty level,and does per-pupil spending significantly impact test scores?
##    
## 2. **Poverty's Persistent Impact**: After controlling for spending levels, why do schools 
##    with higher poverty rates consistently show lower performance?
##    
## 3. **Subject-Specific Poverty Gaps**: Do reading and math performance differ between high-poverty and low-poverty schools?
##    
## 4. **Demographic Effects**: How do student demographics affect academic performance in schools with similar budget levels?
##    
## 5. **Diminishing Returns**: Does increased per-student expenditure show diminishing marginal returns on academic achievement?
##    
## 6. **Functional Form**: Is academic performance more strongly associated with 
##    log-transformed spending measures than raw dollar amounts?
## 
## ## Planned Statistical Analyses
## 
## To address these questions, the following analytical approaches will be employed:
## 
## ### Regression Modeling
## - **Multiple Linear Regression**: Primary analysis for continuous outcomes (math4/read4)
##   - Core model: `math_score ~ poverty + spending + enrollment`
##   - Extended models with interaction terms and polynomial specifications
## - **Logistic Regression**: For binary outcomes (e.g., 80% proficiency threshold)
##   - Model: `I(math_score >= 80) ~ spending + poverty + enrollment`
## 
## ### Key Specifications
## - **Spending Representation**: 
##   - Linear: Per-pupil expenditure (exppp)
##   - Log-transformed: Lexppp (log per-pupil spending)
## - **Poverty Interactions**: 
##   - High-poverty indicator (lunch > 50%)
##   - Spending × poverty interaction terms
## - **Non-Linear Terms**: 
##   - Quadratic spending term: `I(lexppp^2)`
##   - Spending quartile indicators
## 
## ### Analytical Techniques
## 1. **Visual Analysis**: Scatterplots with smoothing curves, boxplots by poverty status
## 2. **Correlation Analysis**: Heatmaps of variable relationships
## 3. **Model Comparison**: AIC/BIC for linear vs log spending specifications
## 4. **Diagnostic Testing**:
##    - Variance Inflation Factors (VIF) for multicollinearity
##    - Breusch-Pagan test for heteroscedasticity
##    - Robust standard errors (HC1)
##    - Residual analysis for model fit
## 
## ### Causal Inference Considerations
## - **Limitations**: Acknowledgment of observational data constraints
## - **Robustness Checks**: 
##   - Alternative model specifications
##   - Subgroup analyses by poverty level
##   - District fixed effects (using dcodes)

Regression Summary:

Poverty shows a strong and consistent negative effect on math scores.

A 10% increase in students eligible for free lunch is linked to a ~4.7 point drop

in math performance. While higher per-pupil spending (especially in log terms)

is associated with improved outcomes, the impact weakens when adjusting

for other variables like school size.

Models using categorical or raw spending measures perform poorly,

reinforcing that poverty is the most reliable and robust predictor of student achievement.

Logistic Regression Summary:

Higher per-pupil spending significantly increases the odds of high math proficiency.

A one-unit increase in log spending nearly triples the odds (OR = 2.99, p < 0.001).

In contrast, higher poverty (free lunch rate) reduces the odds by ~4% per percentage point (OR = 0.96, p < 0.001).

Enrollment has a marginal negative effect (OR = 0.999, p = 0.051).

Overall, spending is a strong positive and poverty a strong negative predictor of academic success.

# Generate regression tables
stargazer::stargazer(model1, model2, model3, model4,
                     type = "text",
                     title = "Regression Results: Math Performance Determinants",
                     covariate.labels = c("Poverty", "Log Spending", "Enrollment", 
                                          "Spending Squared", "Spending Bin Q2", 
                                          "Spending Bin Q3", "Spending Bin Q4"),
                     dep.var.labels = "Math Proficiency (%)")
## 
## Regression Results: Math Performance Determinants
## ========================================================================================================================
##                                                             Dependent variable:                                         
##                     ----------------------------------------------------------------------------------------------------
##                                                             Math Proficiency (%)                                        
##                                (1)                       (2)                       (3)                     (4)          
## ------------------------------------------------------------------------------------------------------------------------
## Poverty                     -0.468***                 -0.462***                 -0.470***                               
##                              (0.014)                   (0.014)                   (0.014)                                
##                                                                                                                         
## Log Spending                11.354***                 8.161***                   6.759*                 205.155***      
##                              (1.772)                   (1.856)                   (3.481)                 (71.832)       
##                                                                                                                         
## Enrollment                                            -0.012***                                                         
##                                                        (0.002)                                                          
##                                                                                                                         
## Spending Squared                                                                  1.029                                 
##                                                                                  (0.671)                                
##                                                                                                                         
## Spending Bin Q2                                                                                         -12.214***      
##                                                                                                          (4.244)        
##                                                                                                                         
## Spending Bin Q3              -6.556                    25.428                    30.178                -788.707***      
##                             (15.007)                  (16.043)                  (28.264)                (304.023)       
##                                                                                                                         
## ------------------------------------------------------------------------------------------------------------------------
## Observations                  1,815                     1,815                     1,815                   1,815         
## R2                            0.369                     0.379                     0.370                   0.005         
## Adjusted R2                   0.369                     0.378                     0.369                   0.004         
## Residual Std. Error    15.765 (df = 1812)        15.646 (df = 1811)        15.760 (df = 1811)       19.804 (df = 1812)  
## F Statistic         530.557*** (df = 2; 1812) 368.710*** (df = 3; 1811) 354.752*** (df = 3; 1811) 4.375** (df = 2; 1812)
## ========================================================================================================================
## Note:                                                                                        *p<0.1; **p<0.05; ***p<0.01
# Visualize diminishing returns
ggplot(clean_df, aes(x = exppp, y = math_score)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", formula = y ~ poly(x, 2), color = "darkred") +
  scale_x_continuous(labels = dollar) +
  labs(title = "Diminishing Returns to Educational Spending",
       subtitle = "Quadratic relationship between spending and math proficiency",
       x = "Per-Pupil Expenditure (USD)",
       y = "Math Proficiency (%)") +
  theme_minimal()

# Poverty-performance gap visualization
clean_df %>%
  group_by(high_poverty) %>%
  summarise(Math = mean(math_score), Reading = mean(reading_score)) %>%
  pivot_longer(cols = c(Math, Reading)) %>%
  ggplot(aes(x = factor(high_poverty), y = value, fill = name)) +
  geom_col(position = "dodge") +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Academic Performance Gap by Poverty Status",
       x = "High Poverty School (>50% free lunch)",
       y = "Average Proficiency (%)",
       fill = "Subject") +
  theme_minimal()

# Diminishing Returns to Spending:
# The scatterplot shows a clear quadratic relationship between per-pupil spending and math proficiency.
# Performance rises with spending up to an optimal point, then plateaus or declines.
# This supports the regression finding: a negative squared term for log spending.
# Conclusion: More spending alone does not guarantee better outcomes—effective resource use matters more.

# Poverty and Achievement Gap:
# The bar chart highlights a stark performance gap between low- and high-poverty schools.
# Students in low-poverty schools score about 21 points higher in math on average.
# This supports the regression results showing a strong negative link between poverty and academic outcomes.
# Poverty remains one of the most consistent predictors of lower performance.

Exploring district level effects using dcode variable

ls("package:wooldridge")
##   [1] "admnrev"       "affairs"       "airfare"       "alcohol"      
##   [5] "apple"         "approval"      "athlet1"       "athlet2"      
##   [9] "attend"        "audit"         "barium"        "beauty"       
##  [13] "benefits"      "beveridge"     "big9salary"    "bwght"        
##  [17] "bwght2"        "campus"        "card"          "catholic"     
##  [21] "cement"        "census2000"    "ceosal1"       "ceosal2"      
##  [25] "charity"       "consump"       "corn"          "countymurders"
##  [29] "cps78_85"      "cps91"         "crime1"        "crime2"       
##  [33] "crime3"        "crime4"        "discrim"       "driving"      
##  [37] "earns"         "econmath"      "elem94_95"     "engin"        
##  [41] "expendshares"  "ezanders"      "ezunem"        "fair"         
##  [45] "fertil1"       "fertil2"       "fertil3"       "fish"         
##  [49] "fringe"        "gpa1"          "gpa2"          "gpa3"         
##  [53] "happiness"     "hprice1"       "hprice2"       "hprice3"      
##  [57] "hseinv"        "htv"           "infmrt"        "injury"       
##  [61] "intdef"        "intqrt"        "inven"         "jtrain"       
##  [65] "jtrain2"       "jtrain3"       "jtrain98"      "k401k"        
##  [69] "k401ksubs"     "kielmc"        "labsup"        "lawsch85"     
##  [73] "loanapp"       "lowbrth"       "mathpnl"       "meap00_01"    
##  [77] "meap01"        "meap93"        "meapsingle"    "minwage"      
##  [81] "mlb1"          "mroz"          "murder"        "nbasal"       
##  [85] "ncaa_rpi"      "nyse"          "okun"          "openness"     
##  [89] "pension"       "phillips"      "pntsprd"       "prison"       
##  [93] "prminwge"      "rdchem"        "rdtelec"       "recid"        
##  [97] "rental"        "return"        "saving"        "school93_98"  
## [101] "sleep75"       "slp75_81"      "smoke"         "traffic1"     
## [105] "traffic2"      "twoyear"       "volat"         "vote1"        
## [109] "vote2"         "voucher"       "wage1"         "wage2"        
## [113] "wagepan"       "wageprc"       "wine"
data( "meap01")
df <- meap01
df <- meap01 %>%
  rename(math_score = math4, 
         reading_score = read4,
         district_code = dcode) %>%
  filter(math_score > 0, reading_score > 0, enroll >= 100) %>%
  mutate(
    high_poverty = ifelse(lunch > median(lunch, na.rm = TRUE), 1, 0),
    spending_quartile = factor(ntile(exppp, 4)),
    log_spending = log(exppp),
    log_enroll = log(enroll)
  )

# Add district-level aggregates
district_df <- df %>%
  group_by(district_code) %>%
  summarise(
    district_poverty = mean(lunch, na.rm = TRUE),
    district_math = mean(math_score, na.rm = TRUE),
    district_read = mean(reading_score, na.rm = TRUE),
    district_spend = mean(exppp, na.rm = TRUE),
    n_schools = n()
  ) %>%
  filter(n_schools >= 3)  # Only districts with ≥3 schools
# Merge back to original data
df <- df %>% left_join(district_df, by = "district_code")

### Research Question 1: Poverty and Spending Effects
q1_model <- lm(math_score ~ lunch + log_spending, data = df)
q1_district <- lm(district_math ~ district_poverty + district_spend, data = district_df)

### Research Question 2: Poverty Effect Controlling for Spending
q2_model <- lm(math_score ~ lunch + log_spending + factor(spending_quartile), data = df)

### Research Question 3: Performance in High vs Low Poverty Schools
q3_ttest <- t.test(math_score ~ high_poverty, data = df)
q3_plot <- ggplot(df, aes(x = factor(high_poverty), y = math_score)) +
  geom_boxplot() +
  labs(x = "High Poverty School", y = "Math Score") +
  scale_x_discrete(labels = c("Low", "High"))

### Research Question 4: Demographics Effects Within Spending Groups
q4_model <- lm(math_score ~ lunch + log_enroll + factor(spending_quartile), data = df)

### Research Question 5: Diminishing Returns to Spending
q5_model <- lm(math_score ~ log_spending + I(log_spending^2), data = df)
inflection_point <- -coef(q5_model)[2]/(2*coef(q5_model)[3])
optimal_spending <- exp(inflection_point)

### Research Question 6: Log vs Linear Spending
q6_linear <- lm(math_score ~ exppp, data = df)
q6_log <- lm(math_score ~ log_spending, data = df)

# Model comparison
model_comp <- data.frame(
  Model = c("Linear Spending", "Log Spending"),
  R2 = c(summary(q6_linear)$r.squared, summary(q6_log)$r.squared),
  AIC = c(AIC(q6_linear), AIC(q6_log)),
  BIC = c(BIC(q6_linear), BIC(q6_log))
)

# Regression Model Diagnostics:
# VIFs in the Q2 model are all below 5, indicating no major multicollinearity concerns.
# In Q1, robust (HC1) standard errors confirm significant effects of:
#  - Poverty (β = -0.47, p < 0.001)
#  - Log Spending (β ≈ 11.35, p < 0.001)
# These diagnostics enhance confidence in the model's validity and interpretation.
# ===================== DISPLAY REGRESSION RESULTS =====================

# === RESEARCH QUESTION 1 ===
stargazer(q1_model, q1_district,
          type = "text",
          title = "RQ1: Poverty and Spending Effects (School vs District)",
          column.labels = c("School-Level", "District-Level"),
          covariate.labels = c("Poverty Rate", "Log Spending", "District Poverty", "District Spending"),
          dep.var.labels = "Math Proficiency (%)",
          align = TRUE, digits = 3)
## 
## RQ1: Poverty and Spending Effects (School vs District)
## ======================================================================
##                                    Dependent variable:                
##                     --------------------------------------------------
##                       Math Proficiency (%)         district_math      
##                           School-Level             District-Level     
##                                (1)                      (2)           
## ----------------------------------------------------------------------
## Poverty Rate                -0.468***                                 
##                              (0.014)                                  
##                                                                       
## Log Spending                11.354***                                 
##                              (1.772)                                  
##                                                                       
## District Poverty                                     -0.424***        
##                                                       (0.028)         
##                                                                       
## District Spending                                     0.002**         
##                                                       (0.001)         
##                                                                       
## Constant                     -6.556                  81.902***        
##                             (15.007)                  (4.090)         
##                                                                       
## ----------------------------------------------------------------------
## Observations                  1,815                     192           
## R2                            0.369                    0.547          
## Adjusted R2                   0.369                    0.542          
## Residual Std. Error    15.765 (df = 1812)         8.095 (df = 189)    
## F Statistic         530.557*** (df = 2; 1812) 113.985*** (df = 2; 189)
## ======================================================================
## Note:                                      *p<0.1; **p<0.05; ***p<0.01
# === RESEARCH QUESTION 2 ===
stargazer(q2_model,
          type = "text",
          title = "RQ2: Poverty Effect Controlling for Spending Quartiles",
          covariate.labels = c("Poverty Rate", "Log Spending", 
                               "Spending Q2", "Spending Q3", "Spending Q4"),
          dep.var.labels = "Math Proficiency (%)",
          align = TRUE, digits = 3)
## 
## RQ2: Poverty Effect Controlling for Spending Quartiles
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                        Math Proficiency (%)    
## -----------------------------------------------
## Poverty Rate                 -0.469***         
##                               (0.014)          
##                                                
## Log Spending                  6.802*           
##                               (3.521)          
##                                                
## Spending Q2                   2.213*           
##                               (1.258)          
##                                                
## Spending Q3                   3.527**          
##                               (1.545)          
##                                                
## Spending Q4                    2.933           
##                               (2.098)          
##                                                
## Constant                      30.145           
##                              (29.136)          
##                                                
## -----------------------------------------------
## Observations                   1,815           
## R2                             0.371           
## Adjusted R2                    0.370           
## Residual Std. Error     15.752 (df = 1809)     
## F Statistic          213.786*** (df = 5; 1809) 
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
# === RESEARCH QUESTION 4 ===
stargazer(q4_model,
          type = "text",
          title = "RQ4: Demographic Effects Within Spending Groups",
          covariate.labels = c("Poverty Rate", "Log Enrollment", 
                               "Spending Q2", "Spending Q3", "Spending Q4"),
          dep.var.labels = "Math Proficiency (%)",
          align = TRUE, digits = 3)
## 
## RQ4: Demographic Effects Within Spending Groups
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                        Math Proficiency (%)    
## -----------------------------------------------
## Poverty Rate                 -0.469***         
##                               (0.014)          
##                                                
## Log Enrollment               -5.369***         
##                               (0.915)          
##                                                
## Spending Q2                  2.864***          
##                               (1.044)          
##                                                
## Spending Q3                  4.670***          
##                               (1.059)          
##                                                
## Spending Q4                  5.041***          
##                               (1.090)          
##                                                
## Constant                    118.980***         
##                               (5.621)          
##                                                
## -----------------------------------------------
## Observations                   1,815           
## R2                             0.382           
## Adjusted R2                    0.380           
## Residual Std. Error     15.621 (df = 1809)     
## F Statistic          223.526*** (df = 5; 1809) 
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
# === RESEARCH QUESTION 5 ===
stargazer(q5_model,
          type = "text",
          title = "RQ5: Diminishing Returns to Spending",
          covariate.labels = c("Log Spending", "Log Spending Squared"),
          dep.var.labels = "Math Proficiency (%)",
          align = TRUE, digits = 3)
## 
## RQ5: Diminishing Returns to Spending
## ================================================
##                          Dependent variable:    
##                      ---------------------------
##                         Math Proficiency (%)    
## ------------------------------------------------
## Log Spending                 205.155***         
##                               (71.832)          
##                                                 
## Log Spending Squared         -12.214***         
##                                (4.244)          
##                                                 
## Constant                     -788.707***        
##                               (304.023)         
##                                                 
## ------------------------------------------------
## Observations                    1,815           
## R2                              0.005           
## Adjusted R2                     0.004           
## Residual Std. Error      19.804 (df = 1812)     
## F Statistic            4.375** (df = 2; 1812)   
## ================================================
## Note:                *p<0.1; **p<0.05; ***p<0.01
# Display Optimal Spending
inflection_point <- -coef(q5_model)[2] / (2 * coef(q5_model)[3])
optimal_spending <- exp(inflection_point)
cat("\nOptimal Spending Point: $", round(optimal_spending, 0), "per pupil\n")
## 
## Optimal Spending Point: $ 4440 per pupil
# === RESEARCH QUESTION 6 ===
stargazer(q6_linear, q6_log,
          type = "text",
          title = "RQ6: Log vs Linear Spending Models",
          column.labels = c("Linear", "Log"),
          covariate.labels = c("Per-Pupil Spending", "Log Spending"),
          dep.var.labels = "Math Proficiency (%)",
          align = TRUE, digits = 3)
## 
## RQ6: Log vs Linear Spending Models
## ============================================================
##                                     Dependent variable:     
##                                 ----------------------------
##                                     Math Proficiency (%)    
##                                     Linear          Log     
##                                       (1)           (2)     
## ------------------------------------------------------------
## Per-Pupil Spending                  -0.001                  
##                                    (0.0004)                 
##                                                             
## Log Spending                                       -1.482   
##                                                   (2.174)   
##                                                             
## Constant                           74.860***     84.644***  
##                                     (2.273)       (18.557)  
##                                                             
## ------------------------------------------------------------
## Observations                         1,815         1,815    
## R2                                   0.001         0.0003   
## Adjusted R2                         0.0004        -0.0003   
## Residual Std. Error (df = 1813)     19.837         19.844   
## F Statistic (df = 1; 1813)           1.651         0.464    
## ============================================================
## Note:                            *p<0.1; **p<0.05; ***p<0.01
# === Model Comparison Table ===
model_comp <- data.frame(
  Model = c("Linear Spending", "Log Spending"),
  R2 = c(summary(q6_linear)$r.squared, summary(q6_log)$r.squared),
  Adj_R2 = c(summary(q6_linear)$adj.r.squared, summary(q6_log)$adj.r.squared),
  AIC = c(AIC(q6_linear), AIC(q6_log)),
  BIC = c(BIC(q6_linear), BIC(q6_log))
)
cat("\nModel Comparison Table:\n")
## 
## Model Comparison Table:
print(model_comp)
##             Model           R2        Adj_R2      AIC      BIC
## 1 Linear Spending 0.0009097360  0.0003586658 15999.63 16016.14
## 2    Log Spending 0.0002561006 -0.0002953301 16000.82 16017.33
# Research Question 3 Results
cat("\n\n===================== RESEARCH QUESTION 3 =====================\n")
## 
## 
## ===================== RESEARCH QUESTION 3 =====================
print(q3_ttest)
## 
##  Welch Two Sample t-test
## 
## data:  math_score by high_poverty
## t = 22.61, df = 1446.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  16.99486 20.22388
## sample estimates:
## mean in group 0 mean in group 1 
##        81.30088        62.69151
# Visualization for Q3
q3_plot <- ggplot(df, aes(x = factor(high_poverty), y = math_score)) +
  geom_boxplot() +
  labs(x = "High Poverty School", y = "Math Score", 
       title = "Math Performance by Poverty Status") +
  scale_x_discrete(labels = c("Low Poverty", "High Poverty"))
print(q3_plot)

# Visualization for Q5
cat("\n\nDiminishing Returns Visualization:\n")
## 
## 
## Diminishing Returns Visualization:
q5_plot <- ggplot(df, aes(x = exppp, y = math_score)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", formula = y ~ poly(log(x), 2), color = "blue") +
  geom_vline(xintercept = optimal_spending, linetype = "dashed", color = "red") +
  scale_x_continuous(labels = scales::dollar) +
  labs(title = "Diminishing Returns to Educational Spending",
       subtitle = paste("Optimal spending around $", round(optimal_spending, 0)),
       x = "Per-Pupil Expenditure (USD)",
       y = "Math Proficiency (%)") +
  theme_minimal()
print(q5_plot)

RQ1: Effects of Poverty on Spending

Poverty (measured via free lunch rates) is strongly negatively associated with math performance.

Log per-pupil spending is positively associated with scores, but the effect weakens at the district level.

Spending improves outcomes to a point, but poverty remains a dominant factor.

RQ2: Control for Spending Quartiles

Even when controlling for spending quartiles, poverty remains a significant negative predictor of math scores.

Quartile variables are not consistently significant, suggesting that high spending alone does not ensure better outcomes.

RQ3: High vs. Low Poverty Schools

T-tests and boxplots show high-poverty schools have much lower average math performance.

Poverty appears to be a structural barrier to achievement.

RQ4: Demographics in Spending Groups

Controlling for enrollment and spending quartiles, poverty remains a strong negative factor.

Larger schools slightly underperform, implying that school size and resource strain affect outcomes.

RQ5: Diminishing Returns on Spending

Math scores increase with spending up to ~$4,440 per student.

After that, returns diminish or even reverse, showing that spending efficiency matters more than quantity.

RQ6: Linear and Log Spending Models

Linear and log models of spending have low explanatory power (low R²).

Poverty and school conditions explain performance better than spending alone.

Other regression models

# Table for Q1-Q2 models
stargazer(q1_model, q1_district, q2_model,
          title = "Poverty and Spending Effects on Math Scores",
          column.labels = c("School Level", "District Level", "With Spending Controls"),
          covariate.labels = c("Poverty Rate", "Log Spending", "Spending Q2", "Spending Q3", "Spending Q4"),
          dep.var.labels = "Math Proficiency (%)",
          type = "text")
## 
## Poverty and Spending Effects on Math Scores
## =======================================================================================================
##                                                        Dependent variable:                             
##                            ----------------------------------------------------------------------------
##                              Math Proficiency (%)         district_math              math_score        
##                                  School Level             District Level       With Spending Controls  
##                                       (1)                      (2)                       (3)           
## -------------------------------------------------------------------------------------------------------
## Poverty Rate                       -0.468***                                          -0.469***        
##                                     (0.014)                                            (0.014)         
##                                                                                                        
## Log Spending                       11.354***                                           6.802*          
##                                     (1.772)                                            (3.521)         
##                                                                                                        
## Spending Q2                                                 -0.424***                                  
##                                                              (0.028)                                   
##                                                                                                        
## Spending Q3                                                  0.002**                                   
##                                                              (0.001)                                   
##                                                                                                        
## Spending Q4                                                                            2.213*          
##                                                                                        (1.258)         
##                                                                                                        
## factor(spending_quartile)3                                                             3.527**         
##                                                                                        (1.545)         
##                                                                                                        
## factor(spending_quartile)4                                                              2.933          
##                                                                                        (2.098)         
##                                                                                                        
## Constant                            -6.556                  81.902***                  30.145          
##                                    (15.007)                  (4.090)                  (29.136)         
##                                                                                                        
## -------------------------------------------------------------------------------------------------------
## Observations                         1,815                     192                      1,815          
## R2                                   0.369                    0.547                     0.371          
## Adjusted R2                          0.369                    0.542                     0.370          
## Residual Std. Error           15.765 (df = 1812)         8.095 (df = 189)        15.752 (df = 1809)    
## F Statistic                530.557*** (df = 2; 1812) 113.985*** (df = 2; 189) 213.786*** (df = 5; 1809)
## =======================================================================================================
## Note:                                                                       *p<0.1; **p<0.05; ***p<0.01
# Q3 Results
cat("\nMath Score Difference (High vs Low Poverty):\n")
## 
## Math Score Difference (High vs Low Poverty):
print(q3_ttest)
## 
##  Welch Two Sample t-test
## 
## data:  math_score by high_poverty
## t = 22.61, df = 1446.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  16.99486 20.22388
## sample estimates:
## mean in group 0 mean in group 1 
##        81.30088        62.69151
print(q3_plot)

# Q4 Results
cat("\nDemographic Effects Within Spending Groups:\n")
## 
## Demographic Effects Within Spending Groups:
stargazer(q4_model, type = "text")
## 
## ======================================================
##                                Dependent variable:    
##                            ---------------------------
##                                    math_score         
## ------------------------------------------------------
## lunch                               -0.469***         
##                                      (0.014)          
##                                                       
## log_enroll                          -5.369***         
##                                      (0.915)          
##                                                       
## factor(spending_quartile)2          2.864***          
##                                      (1.044)          
##                                                       
## factor(spending_quartile)3          4.670***          
##                                      (1.059)          
##                                                       
## factor(spending_quartile)4          5.041***          
##                                      (1.090)          
##                                                       
## Constant                           118.980***         
##                                      (5.621)          
##                                                       
## ------------------------------------------------------
## Observations                          1,815           
## R2                                    0.382           
## Adjusted R2                           0.380           
## Residual Std. Error            15.621 (df = 1809)     
## F Statistic                 223.526*** (df = 5; 1809) 
## ======================================================
## Note:                      *p<0.1; **p<0.05; ***p<0.01
# Q5 Results
cat("\n\n--- Diminishing Returns to Spending ---\n")
## 
## 
## --- Diminishing Returns to Spending ---
cat("Optimal spending point: $", round(optimal_spending, 0), " per pupil\n", sep = "")
## Optimal spending point: $4440 per pupil
stargazer(q5_model, type = "text")
## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                             math_score         
## -----------------------------------------------
## log_spending                205.155***         
##                              (71.832)          
##                                                
## I(log_spending2)            -12.214***         
##                               (4.244)          
##                                                
## Constant                    -788.707***        
##                              (304.023)         
##                                                
## -----------------------------------------------
## Observations                   1,815           
## R2                             0.005           
## Adjusted R2                    0.004           
## Residual Std. Error     19.804 (df = 1812)     
## F Statistic           4.375** (df = 2; 1812)   
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
# Create and display the plot
diminishing_returns_plot <- ggplot(df, aes(x = exppp, y = math_score)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", formula = y ~ poly(log(x), 2), color = "blue") +
  geom_vline(xintercept = optimal_spending, linetype = "dashed", color = "red") +
  scale_x_continuous(labels = scales::dollar) +
  labs(title = "Diminishing Returns to Educational Spending",
       subtitle = paste("Optimal spending around $", round(optimal_spending, 0)),
       x = "Per-Pupil Expenditure (USD)",
       y = "Math Proficiency (%)") +
  theme_minimal()
print(diminishing_returns_plot)

# Q6 Results
cat("\n\n--- Model Comparison: Linear vs Log Spending ---\n")
## 
## 
## --- Model Comparison: Linear vs Log Spending ---
kable(model_comp, digits = 3) %>% print()
## 
## 
## |Model           |    R2| Adj_R2|      AIC|      BIC|
## |:---------------|-----:|------:|--------:|--------:|
## |Linear Spending | 0.001|      0| 15999.63| 16016.14|
## |Log Spending    | 0.000|      0| 16000.82| 16017.33|
# Diagnostic Checks
cat("\n\n--- Model Diagnostics ---\n")
## 
## 
## --- Model Diagnostics ---
cat("VIF for Multicollinearity in Q2 Model:\n")
## VIF for Multicollinearity in Q2 Model:
print(vif(q2_model))
##                               GVIF Df GVIF^(1/(2*Df))
## lunch                     1.070904  1        1.034845
## log_spending              4.162306  1        2.040173
## factor(spending_quartile) 4.234261  3        1.271929
cat("\nRobust Standard Errors for Q1 Model:\n")
## 
## Robust Standard Errors for Q1 Model:
print(coeftest(q1_model, vcov = vcovHC(q1_model, type = "HC1")))
## 
## t test of coefficients:
## 
##               Estimate Std. Error  t value  Pr(>|t|)    
## (Intercept)  -6.555883  17.075820  -0.3839    0.7011    
## lunch        -0.467901   0.016472 -28.4053 < 2.2e-16 ***
## log_spending 11.353860   2.011145   5.6455 1.908e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Other regression models

# Load library
library(stargazer)

# ===================== DISPLAY REGRESSION RESULTS =====================

# === RESEARCH QUESTION 1 ===
stargazer(q1_model, q1_district,
          type = "text",
          title = "RQ1: Poverty and Spending Effects (School vs District)",
          column.labels = c("School-Level", "District-Level"),
          covariate.labels = c("Poverty Rate", "Log Spending", "District Poverty", "District Spending"),
          dep.var.labels = "Math Proficiency (%)",
          align = TRUE, digits = 3)
## 
## RQ1: Poverty and Spending Effects (School vs District)
## ======================================================================
##                                    Dependent variable:                
##                     --------------------------------------------------
##                       Math Proficiency (%)         district_math      
##                           School-Level             District-Level     
##                                (1)                      (2)           
## ----------------------------------------------------------------------
## Poverty Rate                -0.468***                                 
##                              (0.014)                                  
##                                                                       
## Log Spending                11.354***                                 
##                              (1.772)                                  
##                                                                       
## District Poverty                                     -0.424***        
##                                                       (0.028)         
##                                                                       
## District Spending                                     0.002**         
##                                                       (0.001)         
##                                                                       
## Constant                     -6.556                  81.902***        
##                             (15.007)                  (4.090)         
##                                                                       
## ----------------------------------------------------------------------
## Observations                  1,815                     192           
## R2                            0.369                    0.547          
## Adjusted R2                   0.369                    0.542          
## Residual Std. Error    15.765 (df = 1812)         8.095 (df = 189)    
## F Statistic         530.557*** (df = 2; 1812) 113.985*** (df = 2; 189)
## ======================================================================
## Note:                                      *p<0.1; **p<0.05; ***p<0.01
# === RESEARCH QUESTION 2 ===
stargazer(q2_model,
          type = "text",
          title = "RQ2: Poverty Effect Controlling for Spending Quartiles",
          covariate.labels = c("Poverty Rate", "Log Spending", 
                               "Spending Q2", "Spending Q3", "Spending Q4"),
          dep.var.labels = "Math Proficiency (%)",
          align = TRUE, digits = 3)
## 
## RQ2: Poverty Effect Controlling for Spending Quartiles
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                        Math Proficiency (%)    
## -----------------------------------------------
## Poverty Rate                 -0.469***         
##                               (0.014)          
##                                                
## Log Spending                  6.802*           
##                               (3.521)          
##                                                
## Spending Q2                   2.213*           
##                               (1.258)          
##                                                
## Spending Q3                   3.527**          
##                               (1.545)          
##                                                
## Spending Q4                    2.933           
##                               (2.098)          
##                                                
## Constant                      30.145           
##                              (29.136)          
##                                                
## -----------------------------------------------
## Observations                   1,815           
## R2                             0.371           
## Adjusted R2                    0.370           
## Residual Std. Error     15.752 (df = 1809)     
## F Statistic          213.786*** (df = 5; 1809) 
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
# === RESEARCH QUESTION 4 ===
stargazer(q4_model,
          type = "text",
          title = "RQ4: Demographic Effects Within Spending Groups",
          covariate.labels = c("Poverty Rate", "Log Enrollment", 
                               "Spending Q2", "Spending Q3", "Spending Q4"),
          dep.var.labels = "Math Proficiency (%)",
          align = TRUE, digits = 3)
## 
## RQ4: Demographic Effects Within Spending Groups
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                        Math Proficiency (%)    
## -----------------------------------------------
## Poverty Rate                 -0.469***         
##                               (0.014)          
##                                                
## Log Enrollment               -5.369***         
##                               (0.915)          
##                                                
## Spending Q2                  2.864***          
##                               (1.044)          
##                                                
## Spending Q3                  4.670***          
##                               (1.059)          
##                                                
## Spending Q4                  5.041***          
##                               (1.090)          
##                                                
## Constant                    118.980***         
##                               (5.621)          
##                                                
## -----------------------------------------------
## Observations                   1,815           
## R2                             0.382           
## Adjusted R2                    0.380           
## Residual Std. Error     15.621 (df = 1809)     
## F Statistic          223.526*** (df = 5; 1809) 
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
# === RESEARCH QUESTION 5 ===
stargazer(q5_model,
          type = "text",
          title = "RQ5: Diminishing Returns to Spending",
          covariate.labels = c("Log Spending", "Log Spending Squared"),
          dep.var.labels = "Math Proficiency (%)",
          align = TRUE, digits = 3)
## 
## RQ5: Diminishing Returns to Spending
## ================================================
##                          Dependent variable:    
##                      ---------------------------
##                         Math Proficiency (%)    
## ------------------------------------------------
## Log Spending                 205.155***         
##                               (71.832)          
##                                                 
## Log Spending Squared         -12.214***         
##                                (4.244)          
##                                                 
## Constant                     -788.707***        
##                               (304.023)         
##                                                 
## ------------------------------------------------
## Observations                    1,815           
## R2                              0.005           
## Adjusted R2                     0.004           
## Residual Std. Error      19.804 (df = 1812)     
## F Statistic            4.375** (df = 2; 1812)   
## ================================================
## Note:                *p<0.1; **p<0.05; ***p<0.01
# Display Optimal Spending
inflection_point <- -coef(q5_model)[2] / (2 * coef(q5_model)[3])
optimal_spending <- exp(inflection_point)
cat("\nOptimal Spending Point: $", round(optimal_spending, 0), "per pupil\n")
## 
## Optimal Spending Point: $ 4440 per pupil
# === RESEARCH QUESTION 6 ===
stargazer(q6_linear, q6_log,
          type = "text",
          title = "RQ6: Log vs Linear Spending Models",
          column.labels = c("Linear", "Log"),
          covariate.labels = c("Per-Pupil Spending", "Log Spending"),
          dep.var.labels = "Math Proficiency (%)",
          align = TRUE, digits = 3)
## 
## RQ6: Log vs Linear Spending Models
## ============================================================
##                                     Dependent variable:     
##                                 ----------------------------
##                                     Math Proficiency (%)    
##                                     Linear          Log     
##                                       (1)           (2)     
## ------------------------------------------------------------
## Per-Pupil Spending                  -0.001                  
##                                    (0.0004)                 
##                                                             
## Log Spending                                       -1.482   
##                                                   (2.174)   
##                                                             
## Constant                           74.860***     84.644***  
##                                     (2.273)       (18.557)  
##                                                             
## ------------------------------------------------------------
## Observations                         1,815         1,815    
## R2                                   0.001         0.0003   
## Adjusted R2                         0.0004        -0.0003   
## Residual Std. Error (df = 1813)     19.837         19.844   
## F Statistic (df = 1; 1813)           1.651         0.464    
## ============================================================
## Note:                            *p<0.1; **p<0.05; ***p<0.01
# === Model Comparison Table ===
model_comp <- data.frame(
  Model = c("Linear Spending", "Log Spending"),
  R2 = c(summary(q6_linear)$r.squared, summary(q6_log)$r.squared),
  Adj_R2 = c(summary(q6_linear)$adj.r.squared, summary(q6_log)$adj.r.squared),
  AIC = c(AIC(q6_linear), AIC(q6_log)),
  BIC = c(BIC(q6_linear), BIC(q6_log))
)
cat("\nModel Comparison Table:\n")
## 
## Model Comparison Table:
print(model_comp)
##             Model           R2        Adj_R2      AIC      BIC
## 1 Linear Spending 0.0009097360  0.0003586658 15999.63 16016.14
## 2    Log Spending 0.0002561006 -0.0002953301 16000.82 16017.33
# === RESEARCH QUESTION 3 ===
cat("\n\nRQ3: Performance Gap by Poverty Status:\n")
## 
## 
## RQ3: Performance Gap by Poverty Status:
print(q3_ttest)
## 
##  Welch Two Sample t-test
## 
## data:  math_score by high_poverty
## t = 22.61, df = 1446.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  16.99486 20.22388
## sample estimates:
## mean in group 0 mean in group 1 
##        81.30088        62.69151
# === Q3 Plot ===
print(q3_plot)

# === Q5 Plot ===
print(q5_plot)